Mini Project 2

Introduction

For STA 9750 Mini Project 2, I’m going to propose remaking a classic movie: Independence Day.

Watch out, Earth!
Watch out, Earth!

Task 7: Pitch

Originally released in 1996 and starring Will Smith, this film depicts Earth’s fight for freedom from an alien invasion.

Starring Jacob Batalon in his breakout role as Captain Steven Hiller and famous robo-freedom-fighter John DiMaggio as David Levinson, this film follows an ensemble of characters as they look to save Earth from the new threat from the aliens of Omicron Persei 10.

Jacob Batalon
Jacob Batalon
John DiMaggio
John DiMaggio

Directed by Glass Onion and Knives Out director Rian Johnson, the action behind this film belies the mystery of the Omicronians and their true plot.

Rian Johnson
Rian Johnson

How we got here

Now that I’ve thoroughly grabbed your attention, how the heck did we come up with this cast of characters?

After sorting through IMDb data, these three just truly spoke to us and we knew we were on to something.

Task 0: In which the data is gathered

Since we’re doing this in R, let’s get some code going

# Install and import packages
if(!require("ggplot2")) install.packages("ggplot2")
if(!require("tidyverse")) install.packages("tidyverse")
if(!require("dplyr")) install.packages("dplyr")
if(!require("glue")) install.packages("glue")
if(!require("scales")) install.packages("scales")
if(!require("psych")) install.packages("psych")
if(!require("plotly")) install.packages("plotly")
if(!require("readr")) install.packages("readr")

library(ggplot2)
library(tidyverse)
library(dplyr)
library(glue)
library(scales)
library(psych)
library(plotly)
library(readr)

Next let’s import our data

get_imdb_file <- function(fname){
  BASE_URL <- "https://datasets.imdbws.com/"
  fname_ext <- paste0(fname, ".tsv.gz")
  if(!file.exists(fname_ext)){
    FILE_URL <- paste0(BASE_URL, fname_ext)
    download.file(FILE_URL, 
                  destfile = fname_ext)
  }
  as.data.frame(readr::read_tsv(fname_ext, lazy=FALSE))
}

NAME_BASICS      <- get_imdb_file("name.basics")

TITLE_BASICS     <- get_imdb_file("title.basics")

TITLE_EPISODES   <- get_imdb_file("title.episode")

TITLE_RATINGS    <- get_imdb_file("title.ratings")

TITLE_CREW       <- get_imdb_file("title.crew")

TITLE_PRINCIPALS <- get_imdb_file("title.principals")

NAME_BASICS <- NAME_BASICS |>
  filter(str_count(knownForTitles, ",") > 1)

Looking at the data we just pulled

TITLE_RATINGS |>
  ggplot(aes(x=numVotes)) + 
  geom_histogram(bins=30) +
  xlab("Number of IMDB Ratings") + 
  ylab("Number of Titles") + 
  ggtitle("Majority of IMDB Titles Have Less than 100 Ratings") + 
  theme_bw() + 
  scale_x_log10(label=scales::comma) + 
  scale_y_continuous(label=scales::comma)

TITLE_RATINGS |>
  pull(numVotes) |>
  quantile()
     0%     25%     50%     75%    100% 
      5      11      26     101 2952383 
TITLE_RATINGS <- TITLE_RATINGS |>
  filter(numVotes >= 100)

TITLE_BASICS <- TITLE_BASICS |>
  semi_join(TITLE_RATINGS, 
            join_by(tconst == tconst))

TITLE_CREW <- TITLE_CREW |>
  semi_join(TITLE_RATINGS, 
            join_by(tconst == tconst))

TITLE_EPISODES_1 <- TITLE_EPISODES |>
  semi_join(TITLE_RATINGS, 
            join_by(tconst == tconst))
TITLE_EPISODES_2 <- TITLE_EPISODES |>
  semi_join(TITLE_RATINGS, 
            join_by(parentTconst == tconst))

TITLE_EPISODES <- bind_rows(TITLE_EPISODES_1,
                            TITLE_EPISODES_2) |>
  distinct()

TITLE_PRINCIPALS <- TITLE_PRINCIPALS |>
  semi_join(TITLE_RATINGS, join_by(tconst == tconst))


rm(TITLE_EPISODES_1)
rm(TITLE_EPISODES_2)

#EDA
NAME_BASICS <- NAME_BASICS |>
  mutate(birthYear = as.numeric(birthYear),
         deathYear = as.numeric(deathYear))

We have several tasks to complete with this data, including some EDA!

Task 1: Clean Clean Clean

convert_columns <- function(data, columns, conversion = "numeric") {
  
  # Check if conversion is valid
  
  if(!conversion %in% c("numeric", "logical")) {
    stop("Invalid conversion type. Choose either 'numeric' or 'logical'.")
  }
  
  # Apply the conversion based on argument
  if (conversion == "numeric") {
    data <- data |>
      mutate(across(all_of(columns), as.numeric))
  } else if (conversion == "logical") {
   data <- data |>
     mutate(across(all_of(columns), as.logical))
  }
  
  return(data)
}

TITLE_BASICS <- convert_columns(TITLE_BASICS, columns = c("startYear","endYear","runtimeMinutes"), conversion="numeric")

TITLE_EPISODES <- convert_columns(TITLE_EPISODES, columns=c("seasonNumber","episodeNumber"), conversion="numeric")

Task 2: Provided Questions, Provided Answers

Here we seek to answer several questions from the dataset.

#2.1: How many movies are in our data set? How many TV series? How many TV episodes?

# TITLE_BASICS will have this answer
glimpse(TITLE_BASICS)
Rows: 374,231
Columns: 9
$ tconst         <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType      <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle   <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle  <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear      <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres         <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
table(TITLE_BASICS$titleType)

       movie        short    tvEpisode tvMiniSeries      tvMovie     tvSeries 
      132246        16735       156768         5932        15048        29992 
     tvShort    tvSpecial        video    videoGame 
         411         3064         9345         4690 
# movie, tvEpisode and tvSeries

MOVIE_COUNT <- TITLE_BASICS |>
  filter(`titleType` == "movie") |>
  select(`tconst`) |>
  unique() |>
  count()

sprintf("There are %s movies in the data",format(MOVIE_COUNT,big.mark=",", scientific=FALSE))
[1] "There are 132,246 movies in the data"
TV_SERIES_COUNT <- TITLE_BASICS |>
  filter(`titleType` == "tvSeries") |>
  select(`tconst`) |>
  unique() |>
  count()

sprintf("There are %s TV Series in the data",format(TV_SERIES_COUNT,big.mark=",", scientific=FALSE))
[1] "There are 29,992 TV Series in the data"
TV_EPISODE_COUNT <- TITLE_BASICS |>
  filter(`titleType` == "tvEpisode") |>
  select(`tconst`) |>
  unique() |>
  count()

sprintf("There are %s TV Episodes in the data",format(TV_EPISODE_COUNT,big.mark=",", scientific=FALSE))
[1] "There are 156,768 TV Episodes in the data"
#2.2: Oldest living person in data
glimpse(NAME_BASICS)
Rows: 3,189,865
Columns: 6
$ nconst            <chr> "nm0000001", "nm0000002", "nm0000003", "nm0000004", …
$ primaryName       <chr> "Fred Astaire", "Lauren Bacall", "Brigitte Bardot", …
$ birthYear         <dbl> 1899, 1924, 1934, 1949, 1918, 1915, 1899, 1924, 1925…
$ deathYear         <dbl> 1987, 2014, NA, 1982, 2007, 1982, 1957, 2004, 1984, …
$ primaryProfession <chr> "actor,miscellaneous,producer", "actress,soundtrack,…
$ knownForTitles    <chr> "tt0072308,tt0050419,tt0053137,tt0027125", "tt003738…
OLDEST <- NAME_BASICS |>
  filter(is.na(`deathYear`)) |>
  arrange(`birthYear`) |>
  select(`primaryName`,`birthYear`,`deathYear`) |>
  slice_head(n=10)

OLDEST
         primaryName birthYear deathYear
1     Traudl Lessing      1625        NA
2    Robert De Visée      1655        NA
3     William Sandys      1767        NA
4     Richard Dybeck      1811        NA
5     Albert Monnier      1815        NA
6         C. Hostrup      1818        NA
7     Edouard Martin      1825        NA
8       Gustav Lange      1830        NA
9  William S. Hooser      1836        NA
10     Ion Ivanovici      1845        NA
# Clearly this isn't the way to go about this

glimpse(TITLE_BASICS)
Rows: 374,231
Columns: 9
$ tconst         <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType      <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle   <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle  <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear      <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres         <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
glimpse(NAME_BASICS)
Rows: 3,189,865
Columns: 6
$ nconst            <chr> "nm0000001", "nm0000002", "nm0000003", "nm0000004", …
$ primaryName       <chr> "Fred Astaire", "Lauren Bacall", "Brigitte Bardot", …
$ birthYear         <dbl> 1899, 1924, 1934, 1949, 1918, 1915, 1899, 1924, 1925…
$ deathYear         <dbl> 1987, 2014, NA, 1982, 2007, 1982, 1957, 2004, 1984, …
$ primaryProfession <chr> "actor,miscellaneous,producer", "actress,soundtrack,…
$ knownForTitles    <chr> "tt0072308,tt0050419,tt0053137,tt0027125", "tt003738…
NAME_BASICS |> separate_longer_delim(knownForTitles, ",") |>
  slice_head(n=5)
     nconst   primaryName birthYear deathYear
1 nm0000001  Fred Astaire      1899      1987
2 nm0000001  Fred Astaire      1899      1987
3 nm0000001  Fred Astaire      1899      1987
4 nm0000001  Fred Astaire      1899      1987
5 nm0000002 Lauren Bacall      1924      2014
                   primaryProfession knownForTitles
1       actor,miscellaneous,producer      tt0072308
2       actor,miscellaneous,producer      tt0050419
3       actor,miscellaneous,producer      tt0053137
4       actor,miscellaneous,producer      tt0027125
5 actress,soundtrack,archive_footage      tt0037382
TITLES_AND_NAMES <- NAME_BASICS |>
  separate_longer_delim(knownForTitles, delim = ",") |>
  inner_join(TITLE_BASICS, by = c("knownForTitles" = "tconst")) |>
  filter(is.na(deathYear),birthYear >= 1908) |> #according to google, the current oldest person alive was born in 1908
  arrange(birthYear) |>
  select(primaryName, birthYear, deathYear) |>
  unique() |>
  slice_head(n = 5)

head(TITLES_AND_NAMES) # Because there are too many NULL deathYear values, this question can't really be answered
     primaryName birthYear deathYear
1  Robert Aisner      1908        NA
2    Ugo Amadoro      1908        NA
3     Hanna Bark      1908        NA
4   Harry Belock      1908        NA
5 Aida Broadbent      1908        NA
#2.3: There is one TV Episode in this data set with a perfect 10/10 rating and at least 200,00 IMDb ratings. What is it? Which series does it belong to?
glimpse(TITLE_RATINGS)
Rows: 374,231
Columns: 3
$ tconst        <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0…
$ averageRating <dbl> 5.7, 5.6, 6.5, 5.4, 6.2, 5.0, 5.4, 5.4, 5.4, 6.8, 5.2, 7…
$ numVotes      <dbl> 2096, 283, 2104, 183, 2839, 197, 889, 2243, 215, 7728, 4…
HIGHEST_RATED <- TITLE_RATINGS |>
  filter(numVotes >= 200000) |>
  slice_max(order_by = averageRating) |>
  left_join(TITLE_EPISODES, by = c("tconst" = "tconst")) |>
  inner_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
  left_join(TITLE_BASICS, by = c("parentTconst" = "tconst")) |>
  select(
    filmID = `tconst`,
    seriesName = `primaryTitle.y`,
    episodeName = `primaryTitle.x`,
    `seasonNumber`,
    `episodeNumber`,
    yearAired = `startYear.x`,
    `averageRating`,
    `numVotes`
  )

head(HIGHEST_RATED) # Unsurprisingly, it's a season 5 episode of Breaking Bad, specifically, the one where Hank gets got
     filmID   seriesName episodeName seasonNumber episodeNumber yearAired
1 tt2301451 Breaking Bad  Ozymandias            5            14      2013
  averageRating numVotes
1            10   229933

If anyone’s interested in a famous scene (and subsequent meme) from this episode (Warning: violence and spoilers)

Video Preview

#2.4: Which 4 projects is actor Mark Hamill most known for?

# Just guessing beforehand: Star War IV, V, VI, and VA for the Joker in the Batman animated series

MARK_HAMILL <- NAME_BASICS |>
  filter(primaryName == "Mark Hamill")

MARK_HAMILL # going to guess the record with multiple titles is the correct Mark Hamill
     nconst primaryName birthYear deathYear       primaryProfession
1 nm0000434 Mark Hamill        NA        NA actor,producer,director
                           knownForTitles
1 tt0076759,tt2527336,tt0080684,tt0086190
MARK_HAMILL <- NAME_BASICS |>
  filter(nconst == "nm0000434") |>
  separate_longer_delim(knownForTitles, delim = ",") |>
  inner_join(TITLE_BASICS, by = c("knownForTitles" = "tconst")) |>
  select(actorID = `nconst`,
         `primaryName`,
         `primaryTitle`)

print(MARK_HAMILL)
    actorID primaryName                                   primaryTitle
1 nm0000434 Mark Hamill             Star Wars: Episode IV - A New Hope
2 nm0000434 Mark Hamill        Star Wars: Episode VIII - The Last Jedi
3 nm0000434 Mark Hamill Star Wars: Episode V - The Empire Strikes Back
4 nm0000434 Mark Hamill     Star Wars: Episode VI - Return of the Jedi

Apparently, Star Wars Episode VIII gets a listing before his Joker VA. Hard disagree, but I digress.

Video Preview

#2.5: What TV series, with more than 12 episodes, has the highest average rating?
glimpse(TITLE_EPISODES)
Rows: 3,023,489
Columns: 4
$ tconst        <chr> "tt0045960", "tt0046855", "tt0048378", "tt0048562", "tt0…
$ parentTconst  <chr> "tt0044284", "tt0046643", "tt0047702", "tt0047768", "tt0…
$ seasonNumber  <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 8, 1, 10, 6, 2, 8, 4…
$ episodeNumber <dbl> 3, 4, 6, 10, 4, 20, 5, 2, 20, 6, 2, 3, 2, 10, 17, 5, 1, …
glimpse(TITLE_BASICS)
Rows: 374,231
Columns: 9
$ tconst         <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt…
$ titleType      <chr> "short", "short", "short", "short", "short", "short", "…
$ primaryTitle   <chr> "Carmencita", "Le clown et ses chiens", "Poor Pierrot",…
$ originalTitle  <chr> "Carmencita", "Le clown et ses chiens", "Pauvre Pierrot…
$ isAdult        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ startYear      <dbl> 1894, 1892, 1892, 1892, 1893, 1894, 1894, 1894, 1894, 1…
$ endYear        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ runtimeMinutes <dbl> 1, 5, 5, 12, 1, 1, 1, 1, 45, 1, 1, 1, 1, 1, 2, 1, 1, 1,…
$ genres         <chr> "Documentary,Short", "Animation,Short", "Animation,Come…
glimpse(TITLE_RATINGS)
Rows: 374,231
Columns: 3
$ tconst        <chr> "tt0000001", "tt0000002", "tt0000003", "tt0000004", "tt0…
$ averageRating <dbl> 5.7, 5.6, 6.5, 5.4, 6.2, 5.0, 5.4, 5.4, 5.4, 6.8, 5.2, 7…
$ numVotes      <dbl> 2096, 283, 2104, 183, 2839, 197, 889, 2243, 215, 7728, 4…
TWELVE_EP_SERIES <- TITLE_EPISODES |>
  group_by(parentTconst) |>
  summarize(episodeCount = n()) |>
  filter(episodeCount >= 12) |>
  inner_join(TITLE_RATINGS,c("parentTconst" = "tconst")) |>
  left_join(TITLE_BASICS,c("parentTconst" = "tconst")) |>
  select(
    seriesID = `parentTconst`,
    `primaryTitle`,
    `startYear`,
    `endYear`,
    `episodeCount`,
    `averageRating`,
    `numVotes`
  ) |>
  arrange(desc(averageRating), desc(episodeCount), desc(numVotes)) |>
  slice_head(n=5)


TWELVE_EP_SERIES
# A tibble: 5 × 7
  seriesID   primaryTitle  startYear endYear episodeCount averageRating numVotes
  <chr>      <chr>             <dbl>   <dbl>        <int>         <dbl>    <dbl>
1 tt15613780 Craft Games        2014      NA          318           9.7      150
2 tt8560108  Prime Time         2016      NA          212           9.7      181
3 tt21989170 Gumbino            2016      NA          148           9.7      343
4 tt7151672  Choufli Hal        2005    2009          134           9.7     2930
5 tt23028046 Jogandofodda…      2019    2021          101           9.7      168

No clue what these shows are

#2.6: The TV Series Happy Days (1974-1984) gives us the common idiom "jump the shark". The phrase comes from
# a controversial fifth season episode (aired in 1977) in which a lead character literally jumped over a shark on
# water skis. Idiomatically, it is used to refer to the moment when a once-great show becomes ridiculous and
# rapidly loses quality. Is it true that episodes from later seasons of Happy Days have lower
# average ratings than the early seasons?

# First find the ID for Happy Days

TITLE_BASICS |> filter(primaryTitle == "Happy Days",startYear==1974,endYear==1984) |> select(`tconst`,`titleType`,`primaryTitle`)
     tconst titleType primaryTitle
1 tt0070992  tvSeries   Happy Days
glimpse(TITLE_EPISODES)
Rows: 3,023,489
Columns: 4
$ tconst        <chr> "tt0045960", "tt0046855", "tt0048378", "tt0048562", "tt0…
$ parentTconst  <chr> "tt0044284", "tt0046643", "tt0047702", "tt0047768", "tt0…
$ seasonNumber  <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 8, 1, 10, 6, 2, 8, 4…
$ episodeNumber <dbl> 3, 4, 6, 10, 4, 20, 5, 2, 20, 6, 2, 3, 2, 10, 17, 5, 1, …
HAPPY_DAYS <- TITLE_EPISODES |>
  filter(parentTconst=="tt0070992") |>
  left_join(TITLE_RATINGS,c("tconst" = "tconst")) |>
  select(
    -`numVotes`,
    -`parentTconst`,
    -`tconst`
    ) |>
  group_by(seasonNumber) |>
  summarize(avgRating = mean(averageRating,na.rm = TRUE))

HAPPY_DAYS
# A tibble: 11 × 2
   seasonNumber avgRating
          <dbl>     <dbl>
 1            1      7.58
 2            2      7.69
 3            3      7.7 
 4            4      7.43
 5            5      7   
 6            6      7.02
 7            7      6.33
 8            8      5.3 
 9            9      6.4 
10           10      6.7 
11           11      7.33
min_point <- HAPPY_DAYS[which.min(HAPPY_DAYS$avgRating), ]
max_point <- HAPPY_DAYS[which.max(HAPPY_DAYS$avgRating), ]
sharkjump <- HAPPY_DAYS[HAPPY_DAYS$seasonNumber == 5, ]

highlighted_points <- factor(c("Lowest Rating", "Highest Rating", "Jumped Shark"),
                             levels = c("Lowest Rating", "Highest Rating", "Jumped Shark"))  # Set the order here


# Plot ratings over time
ggplot(HAPPY_DAYS, aes(x = seasonNumber, y = avgRating)) + 
  geom_line(size=1) +
  geom_point(color = "#D35400",size=2) +
  
  # Add custom points for min, max, and season 5
  geom_point(data = min_point, aes(color = highlighted_points[1]), size = 3) +
  geom_point(data = max_point, aes(color = highlighted_points[2]), size = 3) +
  geom_point(data = sharkjump, aes(color = highlighted_points[3]), size = 3) +
  
  # Add labels and formatting
  labs(title = "Happy Days Average Rating by Season", x = "Season", y = "Average Rating") +
  
  # Customize ticks for x and y axes
  scale_x_continuous(breaks = seq(min(HAPPY_DAYS$seasonNumber), max(HAPPY_DAYS$seasonNumber), by = 1)) +
  scale_y_continuous(labels = label_number(), breaks = seq(floor(min(HAPPY_DAYS$avgRating)), ceiling(max(HAPPY_DAYS$avgRating)), by = 0.5)) +
  
  # Manually define colors for the highlighted points
  scale_color_manual(
    values = c("Lowest Rating" = "blue", "Highest Rating" = "red", "Jumped Shark" = "green"),
    name = ""
  ) +
  
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"), 
    axis.title.x = element_text(size = 14),  
    axis.title.y = element_text(size = 14),  
    axis.text = element_text(size = 12),  
    # panel.grid.minor = element_blank(),  
    legend.position = "right",  # Position the legend on the right
    legend.text = element_text(face = "bold", size = 8),
    
    # Set the background colors
    panel.background = element_rect(fill = "gray90"),  # Gray background for the plot area
    plot.background = element_rect(fill = "gray90"),  # Gray background for the entire plot
    
    # Enhance axis lines
    axis.line = element_line(color = "black", size = 1.2),  # Change color and thickness of axis lines
    axis.ticks = element_line(color = "black", size = 1),  # Change color and thickness of tick marks
    axis.ticks.length = unit(0.25, "cm"),  # Adjust length of tick marks
    
    # Enhance gridlines
    panel.grid.major = element_line(color = "gray60", size = 0.5),  # Change color and thickness of major gridlines
    panel.grid.minor = element_line(color = "gray60", size = 0.5)   # Change color and thickness of minor gridlines
  )

As we can see, the shark-jump likely caused an immediate dip in viewership, but it rebounded for the final 3 seasons and went out near the height of its popularity

Task 3: What is success?

Here we create a “success” metric to determine whether or not a film meets our own standards.

First let’s define success. We want to look at two features: * Average Rating * Num Votes Other features of note could be * Crew Size * Run Time

# First let's just get only movie data from 1960 onwards
MOVIES <- TITLE_BASICS |>
  filter(titleType == "movie",startYear >= 1960)

# Add in ratings
MOVIES <- MOVIES |>
  left_join(TITLE_RATINGS, c("tconst" = "tconst")) |>
  select(
    `tconst`,
    `primaryTitle`,
    `isAdult`,
    releaseYear = `startYear`,
    `runtimeMinutes`,
    `genres`,
    `averageRating`,
    `numVotes`
  )

# Add in crew size
CREWS <- TITLE_PRINCIPALS |>
  group_by(tconst) |>
  summarize(castCount = n())

MOVIES <- MOVIES |>
  left_join(CREWS, c("tconst" = "tconst"))

# Count the number of genres in the 'genres' column
GENRE_COUNT <- MOVIES |>
  separate_longer_delim(genres, delim = ",") |>  # Split into multiple rows by delimiter
  group_by(tconst) |>                               # Group by title to keep track of original rows
  summarise(genre_count = n(), .groups = 'drop')  # Count the number of genres


# Join back to the original MOVIES dataframe if needed
MOVIES <- MOVIES |>
  left_join(GENRE_COUNT, by = "tconst")

# summary stats
describe(MOVIES)
               vars      n     mean       sd  median  trimmed      mad  min
tconst*           1 118505 59253.00 34209.59 59253.0 59253.00 43923.51    1
primaryTitle*     2 118505 53767.12 30890.58 53903.0 53812.98 39907.14    1
isAdult           3 118505     0.01     0.09     0.0     0.00     0.00    0
releaseYear       4 118505  2004.03    16.86  2010.0  2006.16    13.34 1960
runtimeMinutes    5 116399   101.65   202.99    96.0    98.24    14.83   17
genres*           6 118505   579.73   274.72   633.0   599.44   201.63    1
averageRating     7 118505     5.88     1.32     6.1     5.95     1.19    1
numVotes          8 118505  9392.56 58913.55   473.0  1139.48   498.15  100
castCount         9 114101    18.47     4.11    19.0    18.70     2.97    1
genre_count      10 118505     2.00     0.83     2.0     2.00     1.48    1
                   max   range   skew kurtosis     se
tconst*         118505  118504   0.00    -1.20  99.38
primaryTitle*   107056  107055  -0.01    -1.21  89.73
isAdult              1       1  10.77   113.99   0.00
releaseYear       2024      64  -0.94    -0.17   0.05
runtimeMinutes   51420   51403 224.22 52765.55   0.59
genres*           1010    1009  -0.66    -0.48   0.80
averageRating       10       9  -0.52     0.17   0.00
numVotes       2952383 2952283  17.20   456.62 171.14
castCount           57      56  -0.44     2.86   0.01
genre_count          3       2   0.00    -1.55   0.00
# 90% of movies fall between an average rating of 3.4 and 7.7
quantile(MOVIES$averageRating,.05)
 5% 
3.4 
quantile(MOVIES$averageRating,.95)
95% 
7.7 
# We can use this to determine both what makes a great move and what makes a flop

FLOPS <- MOVIES |>
  filter(averageRating <= 3.4)

SUCCESSES <- MOVIES |>
  filter(averageRating >= 7.7)

describe(FLOPS)
               vars    n    mean      sd median trimmed     mad  min      max
tconst*           1 6162 3081.50 1778.96 3081.5 3081.50 2283.95    1   6162.0
primaryTitle*     2 6162 3055.68 1762.50 3054.5 3056.01 2261.71    1   6107.0
isAdult           3 6162    0.00    0.05    0.0    0.00    0.00    0      1.0
releaseYear       4 6162 2010.13   12.33 2013.0 2012.26    8.90 1960   2024.0
runtimeMinutes    5 5952   98.85  559.05   89.0   89.42   10.38   26  43200.0
genres*           6 6162  226.28  128.67  239.0  233.78  170.50    1    396.0
averageRating     7 6162    2.82    0.52    2.9    2.88    0.44    1      3.4
numVotes          8 6162 1333.77 6306.58  334.0  468.67  269.83  100 180239.0
castCount         9 5886   18.30    3.86   18.0   18.36    2.97    1     49.0
genre_count      10 6162    1.94    0.84    2.0    1.92    1.48    1      3.0
                  range  skew kurtosis    se
tconst*          6161.0  0.00    -1.20 22.66
primaryTitle*    6106.0  0.00    -1.20 22.45
isAdult             1.0 20.90   435.00  0.00
releaseYear        64.0 -1.67     2.81  0.16
runtimeMinutes  43174.0 76.99  5933.03  7.25
genres*           395.0 -0.36    -1.27  1.64
averageRating       2.4 -1.09     0.74  0.01
numVotes       180139.0 14.03   256.60 80.34
castCount          48.0  0.03     3.71  0.05
genre_count         2.0  0.12    -1.58  0.01
describe(SUCCESSES)
               vars    n     mean        sd median trimmed     mad    min
tconst*           1 7224  3612.50   2085.53 3612.5 3612.50 2677.58    1.0
primaryTitle*     2 7224  3574.23   2063.38 3573.5 3573.97 2651.63    1.0
isAdult           3 7224     0.00      0.05    0.0    0.00    0.00    0.0
releaseYear       4 7224  2004.91     17.78 2011.0 2007.24   14.83 1960.0
runtimeMinutes    5 6930   115.03     42.59  109.0  111.80   31.13   17.0
genres*           6 7224   261.63    111.20  295.0  272.18   94.89    1.0
averageRating     7 7224     8.10      0.42    8.0    8.03    0.30    7.7
numVotes          8 7224 44528.35 188879.33  699.0 3525.70  856.94  100.0
castCount         9 6892    16.43      5.12   17.0   16.71    4.45    1.0
genre_count      10 7224     1.90      0.84    2.0    1.88    1.48    1.0
                   max     range  skew kurtosis      se
tconst*           7224    7223.0  0.00    -1.20   24.54
primaryTitle*     7150    7149.0  0.00    -1.20   24.28
isAdult              1       1.0 21.17   446.38    0.00
releaseYear       2024      64.0 -0.93    -0.26    0.21
runtimeMinutes    1440    1423.0  6.95   157.15    0.51
genres*            446     445.0 -0.79    -0.15    1.31
averageRating       10       2.3  1.51     2.30    0.00
numVotes       2952383 2952283.0  6.83    59.67 2222.26
castCount           38      37.0 -0.43     0.53    0.06
genre_count          3       2.0  0.18    -1.55    0.01
# Successes seem to be longer on average but also confined to a range of 1,440 minutes runtime while flops get a little silly
# A lot more people vote for successes (surprising because you'd think people are more likely to respond if they dislike something than if they like it)
# https://www.nationalstrategic.com/why-would-they-write-that-the-psychology-of-customer-reviews/

# The number of genres used seem to not matter pretty much at all

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

Mode(SUCCESSES$genres)
[1] "Documentary"
Mode(FLOPS$genres)
[1] "Horror"
head(sort(desc(table(SUCCESSES$genres))),n=20)

                  Documentary                         Drama 
                        -1083                         -1040 
                       Comedy                  Comedy,Drama 
                         -289                          -248 
            Documentary,Music                 Drama,Romance 
                         -241                          -205 
        Biography,Documentary          Comedy,Drama,Romance 
                         -119                          -112 
           Action,Crime,Drama             Documentary,Sport 
                         -105                          -105 
          Documentary,History                      Thriller 
                          -96                           -90 
         Crime,Drama,Thriller   Biography,Documentary,Music 
                          -81                           -79 
               Drama,Thriller                   Crime,Drama 
                          -79                           -77 
                 Drama,Family                           \\N 
                          -76                           -70 
Biography,Documentary,History                     Drama,War 
                          -66                           -65 
head(sort(desc(table(FLOPS$genres))),n=20)

                 Horror                  Comedy         Horror,Thriller 
                   -956                    -598                    -334 
                  Drama                Thriller                  Action 
                   -199                    -156                    -150 
          Comedy,Horror            Comedy,Drama Horror,Mystery,Thriller 
                   -135                     -88                     -87 
          Horror,Sci-Fi                  Sci-Fi Action,Adventure,Comedy 
                    -84                     -84                     -76 
   Drama,Horror,Mystery           Action,Sci-Fi          Comedy,Romance 
                    -74                     -73                     -73 
  Drama,Horror,Thriller      Action,Crime,Drama             Documentary 
                    -70                     -67                     -65 
           Action,Drama         Action,Thriller 
                    -64                     -62 
table(SUCCESSES$isAdult)

   0    1 
7208   16 
table(FLOPS$isAdult)

   0    1 
6148   14 
# no effect

Documentaries and Dramas are successfull while Horror is not. Comedy can go either way.

Metrics for success: * Good genre (Drama, Documentary) * High number of votes (> 500) * Runtime between 70 minutes and 150 minutes. Dock points for each STD outside of that Metrics for flop: * Bad genre (Horror, Comedy) * Low number of votes (<= 500) * Runtime greater than 200 minutes

# Define the metric for success function
metric_for_success <- function(df) {
  df <- df |>
    mutate(
      # Calculate positive contributions
      positive_metric = ifelse(str_detect(genres, "Drama|Documentary"), 0.2, 0) +
        ifelse(numVotes >= 500, 0.4, 0) +
        ifelse(runtimeMinutes >= 70 & runtimeMinutes <= 150, 0.4, 0),
      
      # Calculate negative contributions
      negative_metric = ifelse(str_detect(genres, "Horror|Comedy"), -0.4, 0) +
        ifelse(numVotes < 500 | is.na(numVotes), -0.3, 0) +
        ifelse(runtimeMinutes > 200, -pmin(0.5, (runtimeMinutes - 200) / 90), 0),
      
      # Combine the positive and negative contributions
      metric = positive_metric + negative_metric,
      
      # Bound the metric between -1 and 1
      metric = pmin(pmax(metric, -1), 1),
      
      # Round the metric to 3 decimal places
      metric = round(metric, 3)
    ) |>
    # Drop the intermediate columns (optional)
    select(-positive_metric, -negative_metric)
  
  return(df)
}

#Apply the metric function to the DataFrame
MOVIES <- metric_for_success(MOVIES)

table(MOVIES$metric)

    -1 -0.944   -0.8 -0.767 -0.722   -0.7 -0.611   -0.6 -0.567 -0.544 -0.533 
     9      1      8      1      1    642      1     59      1      5      1 
  -0.5 -0.478 -0.467 -0.444 -0.433 -0.411   -0.4 -0.389 -0.378 -0.367 -0.356 
   293      2      1      1      1      2      3      1      2      2      3 
-0.344 -0.333 -0.322 -0.311   -0.3 -0.289 -0.278 -0.267 -0.256 -0.244 -0.233 
     1      2      5      2  15056      1      2      2      1      1      1 
-0.222 -0.211 -0.189 -0.167 -0.144 -0.122 -0.111   -0.1 -0.067 -0.056 -0.033 
     1      9      3      2      1      1      5   8398      2      1      1 
-0.022      0  0.022  0.067    0.1  0.111  0.133  0.156  0.167  0.178  0.189 
     1    420      1      1   7901      1      2      4      1      4      2 
   0.2  0.211  0.233  0.244  0.256  0.267  0.278    0.3  0.311  0.322  0.333 
   467      1      1      2      1      2      1  26557      2      2      1 
 0.344  0.367  0.378  0.389    0.4  0.411  0.422  0.433  0.444  0.456  0.489 
     1      1      6      2  16669      4      2      2      2      3      6 
   0.5  0.511  0.522  0.533  0.544  0.556  0.567  0.578  0.589    0.6    0.8 
     1      5      2      1      2      1      1      4      3  10527   6349 
     1 
 22892 
#3.1: Choose the top 5-10 movies by my metric to confirm successes
MOVIES |>
  filter(metric==1) |>
  select(
    -`isAdult`,
    -`tconst`,
    -`genre_count`
  ) |>
  sample_n(10)
           primaryTitle releaseYear runtimeMinutes                  genres
1      The Last Lullaby        2008             93           Drama,Mystery
2                  Leto        2018            126 Biography,Drama,History
3       A Girl Returned        2021            110                   Drama
4             Intrusion        2021             92    Action,Drama,Mystery
5  Night of the Juggler        1980            101      Action,Crime,Drama
6    Children of Heaven        1997             89      Drama,Family,Sport
7                 Sunny        2021             93    Crime,Drama,Thriller
8                 Close        2022            104                   Drama
9               Lebanon        2009             93               Drama,War
10 Farewell to the King        1989            117  Action,Adventure,Drama
   averageRating numVotes castCount metric
1            6.1     1023        20      1
2            7.3     8076        24      1
3            6.9      500        21      1
4            5.3    21930        24      1
5            6.5     1128        20      1
6            8.2    82639        17      1
7            6.7     6429        16      1
8            7.8    39014        NA      1
9            6.9    12060        22      1
10           6.2     3176        22      1
# This metric works really well in conjunction with a high number of votes
MOVIES |>
  arrange(desc(metric),desc(numVotes)) |>
  slice_head(n=10)
      tconst             primaryTitle isAdult releaseYear runtimeMinutes
1  tt0111161 The Shawshank Redemption       0        1994            142
2  tt0137523               Fight Club       0        1999            139
3  tt0109830             Forrest Gump       0        1994            142
4  tt0114369                    Se7en       0        1995            127
5  tt0372784            Batman Begins       0        2005            140
6  tt0102926 The Silence of the Lambs       0        1991            118
7  tt7286456                    Joker       0        2019            122
8  tt1130884           Shutter Island       0        2010            138
9  tt0482571             The Prestige       0        2006            130
10 tt0099685               Goodfellas       0        1990            145
                   genres averageRating numVotes castCount genre_count metric
1                   Drama           9.3  2952383        19           1      1
2                   Drama           8.8  2384372        23           1      1
3           Drama,Romance           8.8  2309923        21           2      1
4     Crime,Drama,Mystery           8.6  1843387        21           3      1
5      Action,Crime,Drama           8.2  1607729        26           3      1
6    Crime,Drama,Thriller           8.6  1583671        21           3      1
7    Crime,Drama,Thriller           8.4  1548918        23           3      1
8  Drama,Mystery,Thriller           8.2  1496563        21           3      1
9    Drama,Mystery,Sci-Fi           8.5  1471339        22           3      1
10  Biography,Crime,Drama           8.7  1285009        19           3      1
#3.2: Choose 3-5  movies with large numbers of IMDb votes that socre poorly on your success metric and confirm
# that they are indeed low quality
MOVIES |>
  arrange(metric, desc(numVotes)) |>
  select(`primaryTitle`,
         `releaseYear`,
         `numVotes`,
         `averageRating`,
         `genres`,
         `metric`) |>
  slice_head(n=5)
                               primaryTitle releaseYear numVotes averageRating
1                            Out 1: Spectre        1972      437           7.2
2 Three Mirrors Creature's Flashes of Flesh        2023      422           7.8
3                       The Cremaster Cycle        2003      306           6.9
4                         Haripada Bandwala        2016      290           4.4
5                            The Movie Orgy        2009      211           7.8
                      genres metric
1      Comedy,Drama,Thriller     -1
2        Drama,Horror,Sci-Fi     -1
3       Drama,Fantasy,Horror     -1
4             Comedy,Romance     -1
5 Comedy,Documentary,History     -1

I didn’t need a personal metric to tell me that these 5 movies suck

#3.3 Choose a prestige actor or director and confirm that they have many projects with high scores on your success metric.
NAME_BASICS |>
  filter(primaryName == "Stanley Kubrick") |>
  separate_longer_delim(knownForTitles, delim = ",") |>
  inner_join(MOVIES, by = c("knownForTitles" = "tconst")) |>
  select(
    `primaryName`,
    `primaryTitle`,
    `averageRating`,
    `numVotes`,
    `genres`,
    `runtimeMinutes`,
    `metric`
         )
      primaryName          primaryTitle averageRating numVotes
1 Stanley Kubrick 2001: A Space Odyssey           8.3   733226
2 Stanley Kubrick    A Clockwork Orange           8.2   894278
3 Stanley Kubrick          Barry Lyndon           8.1   187061
4 Stanley Kubrick     Full Metal Jacket           8.2   804176
               genres runtimeMinutes metric
1    Adventure,Sci-Fi            149    0.8
2        Crime,Sci-Fi            136    0.8
3 Adventure,Drama,War            185    0.6
4           Drama,War            116    1.0

Pretty good

#3.4 Perform at least one other "spot check" validation

counts <- MOVIES %>%
  summarize(
    above_7_5_high_metric = sum(averageRating > 7.5 & metric >= 0.6, na.rm = TRUE),
    between_5_and_7_5_high_metric = sum(averageRating >= 5 & averageRating <= 7.5 & metric >= 0.6, na.rm = TRUE),
    below_5_high_metric = sum(averageRating < 5 & metric >= 0.6, na.rm = TRUE),
    above_5_and_7_5_low_metric = sum(averageRating > 7.5 & metric <= 0.3, na.rm = TRUE),
    between_5_and_7_5_low_metric = sum(averageRating >= 5 & averageRating <= 7.5 & metric <= 0.3, na.rm = TRUE),
    below_5_low_metric = sum(averageRating < 5 & metric <= 0.3, na.rm = TRUE)
  )

counts
  above_7_5_high_metric between_5_and_7_5_high_metric below_5_high_metric
1                  4229                         30767                4772
  above_5_and_7_5_low_metric between_5_and_7_5_low_metric below_5_low_metric
1                       3766                        39965              16177

Although the metric is imperfect, it is pretty good at telling us what’s good and what’s not (based on average ratings)

#3.5: Come up with a numerical threshold for a project to be a ‘success’; that is, determine a value such that movies above `v` are all “solid” or better.

v <- 0.6

Task 4: What makes a good movie?

# Juuuust incase
avgrtgthrsh <- 6.0


# Create a new column indicating success based on averageRating
MOVIES <- MOVIES |>
  mutate(success_averageRating = ifelse(averageRating >= avgrtgthrsh, 1, 0),
         success_metric = ifelse(metric >= v, 1, 0)) |>
  mutate(decade = floor(releaseYear / 10) * 10)

MOVIES_GENRES <- MOVIES |>
  filter(genres != "\\N", averageRating != "\\N") |>
  separate_longer_delim(genres, delim = ",") |>
  mutate(genres = trimws(genres)) |>
  mutate(decade = floor(releaseYear / 10) * 10)

# Calculate total films by genre and decade
total_films_by_genre <- MOVIES_GENRES |>
  group_by(decade, genres) |>
  summarize(total_count = n(), .groups = "drop")

# Filter genres by count
popular_genres <- total_films_by_genre |>
  filter(total_count > 5000) |>
  select(genres) |>
  distinct()

# Count successes for averageRating (for popular genres)
success_by_genre_rating_popular <- MOVIES_GENRES |>
  filter(genres %in% popular_genres$genres) |>
  group_by(decade, genres) |>
  summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
  left_join(total_films_by_genre, by = c("decade", "genres")) |>
  mutate(success_rate = success_count / total_count * 100) |>
  arrange(decade, desc(success_rate))

# Count successes for metric (for popular genres)
success_by_genre_metric_popular <- MOVIES_GENRES |>
  filter(genres %in% popular_genres$genres) |>
  group_by(decade, genres) |>
  summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
  left_join(total_films_by_genre, by = c("decade", "genres")) |>
  mutate(success_rate = success_count / total_count * 100) |>
  arrange(decade, desc(success_rate))

# Count successes for averageRating (for less popular genres)
success_by_genre_rating_less <- MOVIES_GENRES |>
  filter(!(genres %in% popular_genres$genres)) |>
  group_by(decade, genres) |>
  summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
  left_join(total_films_by_genre, by = c("decade", "genres")) |>
  mutate(success_rate = success_count / total_count * 100) |>
  arrange(decade, desc(success_rate))

# Count successes for metric (for less popular genres)
success_by_genre_metric_less <- MOVIES_GENRES |>
  filter(!(genres %in% popular_genres$genres)) |>
  group_by(decade, genres) |>
  summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
  left_join(total_films_by_genre, by = c("decade", "genres")) |>
  mutate(success_rate = success_count / total_count * 100) |>
  arrange(decade, desc(success_rate))

# Plotting Success Rates for Average Rating (Popular Genres)
p1 <- ggplot(success_by_genre_rating_popular, aes(x = decade, y = success_rate, color = genres, group = genres)) +
  geom_line(size = 1) +  # Add line
  geom_point(size = 3) +  # Add points for clarity
  labs(title = "Success Rates by Genre (Average Rating - Popular Genres)", x = "Decade", y = "Success Rate (%)") +
  scale_y_continuous(limits = c(0, 100)) +  # Set y-axis limits from 0 to 100
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

# Convert ggplot to plotly for hover functionality
p1_interactive <- ggplotly(p1)

# Plotting Success Rates for Metric (Popular Genres)
p2 <- ggplot(success_by_genre_metric_popular, aes(x = decade, y = success_rate, color = genres, group = genres)) +
  geom_line(size = 1) +  # Add line
  geom_point(size = 3) +  # Add points for clarity
  labs(title = "Success Rates by Genre (Metric - Popular Genres)", x = "Decade", y = "Success Rate (%)") +
  scale_y_continuous(limits = c(0, 100)) +  # Set y-axis limits from 0 to 100
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

# Convert ggplot to plotly for hover functionality
p2_interactive <- ggplotly(p2)

# Plotting Success Rates for Average Rating (Less Popular Genres)
p3 <- ggplot(success_by_genre_rating_less, aes(x = decade, y = success_rate, color = genres, group = genres)) +
  geom_line(size = 1) +  # Add line
  geom_point(size = 3) +  # Add points for clarity
  labs(title = "Success Rates by Genre (Average Rating - Less Popular Genres)", x = "Decade", y = "Success Rate (%)") +
  scale_y_continuous(limits = c(0, 100)) +  # Set y-axis limits from 0 to 100
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

# Convert ggplot to plotly for hover functionality
p3_interactive <- ggplotly(p3)

# Plotting Success Rates for Metric (Less Popular Genres)
p4 <- ggplot(success_by_genre_metric_less, aes(x = decade, y = success_rate, color = genres, group = genres)) +
  geom_line(size = 1) +  # Add line
  geom_point(size = 3) +  # Add points for clarity
  labs(title = "Success Rates by Genre (Metric - Less Popular Genres)", x = "Decade", y = "Success Rate (%)") +
  scale_y_continuous(limits = c(0, 100)) +  # Set y-axis limits from 0 to 100
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

# Convert ggplot to plotly for hover functionality
p4_interactive <- ggplotly(p4)
p1_interactive
p2_interactive
p3_interactive
p4_interactive

They’re interactive!

#4.1: Which genre has the most "successs" in each decade"?


# Count successes for averageRating
success_by_genre_rating <- MOVIES_GENRES |>
  group_by(decade, genres) |>
  summarize(success_count = sum(success_averageRating, na.rm = TRUE), .groups = "drop") |>
  arrange(decade, desc(success_count))

# Count successes for metric
success_by_genre_metric <- MOVIES_GENRES |>
  group_by(decade, genres) |>
  summarize(success_count = sum(success_metric, na.rm = TRUE), .groups = "drop") |>
  arrange(decade, desc(success_count))

# Identify the genre with the most successes for each decade (Average Rating)
most_successful_genre_rating <- success_by_genre_rating |>
  group_by(decade) |>
  slice_max(success_count, n = 2) |>
  ungroup()

# Identify the genre with the most successes for each decade (Metric)
most_successful_genre_metric <- success_by_genre_metric |>
  group_by(decade) |>
  slice_max(success_count, n = 2) |>
  ungroup()

most_successful_genre_rating
# A tibble: 14 × 3
   decade genres      success_count
    <dbl> <chr>               <dbl>
 1   1960 Drama                2567
 2   1960 Comedy               1287
 3   1970 Drama                2990
 4   1970 Comedy               1318
 5   1980 Drama                3220
 6   1980 Comedy               1783
 7   1990 Drama                4170
 8   1990 Comedy               2184
 9   2000 Drama                7339
10   2000 Comedy               3325
11   2010 Drama               11449
12   2010 Documentary          5084
13   2020 Drama                5103
14   2020 Comedy               2174
most_successful_genre_metric
# A tibble: 15 × 3
   decade genres    success_count
    <dbl> <chr>             <dbl>
 1   1960 Drama              1526
 2   1960 Adventure           308
 3   1960 Romance             308
 4   1970 Drama              1890
 5   1970 Crime               517
 6   1980 Drama              2083
 7   1980 Action              600
 8   1990 Drama              3258
 9   1990 Comedy              888
10   2000 Drama              6342
11   2000 Comedy             1711
12   2010 Drama             10358
13   2010 Action             2553
14   2020 Drama              4501
15   2020 Thriller           1319

Drama and Action films are pretty succesfull!

#4.2 Which genre consistently has the most "successes"? Which genre used to reliably produce "successes" and has fallen out of favor?
p1_interactive
p2_interactive

Documentaries have the most consistent successes.

Horror used to produce a lot of successes but has been on a downward trend each decade.

#4.3: What genre has produced the most “successes” since 2010?
#Does it have the highest success rate or does it only have a large number of successes because there are many productions in that genre?
p1_interactive
p2_interactive
most_successful_genre_rating
# A tibble: 14 × 3
   decade genres      success_count
    <dbl> <chr>               <dbl>
 1   1960 Drama                2567
 2   1960 Comedy               1287
 3   1970 Drama                2990
 4   1970 Comedy               1318
 5   1980 Drama                3220
 6   1980 Comedy               1783
 7   1990 Drama                4170
 8   1990 Comedy               2184
 9   2000 Drama                7339
10   2000 Comedy               3325
11   2010 Drama               11449
12   2010 Documentary          5084
13   2020 Drama                5103
14   2020 Comedy               2174
most_successful_genre_metric
# A tibble: 15 × 3
   decade genres    success_count
    <dbl> <chr>             <dbl>
 1   1960 Drama              1526
 2   1960 Adventure           308
 3   1960 Romance             308
 4   1970 Drama              1890
 5   1970 Crime               517
 6   1980 Drama              2083
 7   1980 Action              600
 8   1990 Drama              3258
 9   1990 Comedy              888
10   2000 Drama              6342
11   2000 Comedy             1711
12   2010 Drama             10358
13   2010 Action             2553
14   2020 Drama              4501
15   2020 Thriller           1319

Drama seems to be the answer to both of these

#4.4: What genre has become more popular in recent years?

p3_interactive
p4_interactive

Action movies

Task 5: Identifying the Crew

Hang on to your hats because there’s about to be a lot of data manipulation.

Basically, we’re going to whittle down the list of actors and directors until we find a group that matches what we’re looking for in producing the next box-office smash hit.

#Filter TITLE_BASICS based on metric threshold and genre
FILTERED_MOVIES <- MOVIES |>
  separate_longer_delim(genres, delim = ",") |>
  filter(success_metric == 1, genres %in% c("Action", "Drama", "Thriller", "Documentary")) |>
  group_by(tconst) |>
  summarize(
    title = first(primaryTitle), # Adjust according to your dataset
    year = first(releaseYear), # Adjust according to your dataset
    metric = first(metric), # Adjust according to your dataset,
    avgRating = first(averageRating),
    numVotes,
    decade = first(decade),
    castCount = first(castCount),
    genres = paste(unique(genres), collapse = ", ") # Combine genres into one string
  )


ACTORS <- NAME_BASICS |>
  filter(grepl("actor", primaryProfession, ignore.case = TRUE), is.na(deathYear)) |>
  separate_longer_delim(primaryProfession, delim = ",") |>
  filter(primaryProfession == "actor") |>
  mutate(age = 2024 - birthYear) |>
  select(-`deathYear`)

# Joining with TITLE_PRINCIPALS
ACTORS_TITLES <- ACTORS |>
  inner_join(TITLE_PRINCIPALS, by = c("nconst" = "nconst")) |>
  left_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
  inner_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
  select(
    `nconst`,
    `primaryName`,
    `age`,
    `tconst`,
    `primaryTitle`,
    releaseYear = `startYear`,
    `decade`,
    `castCount`,
    `runtimeMinutes`,
    genres = `genres.x`,
    mainGenre = `genres.y`,
    `metric`,
    `avgRating`
  ) |>
  distinct()

DIRECTORS <- NAME_BASICS |>
  filter(grepl("director", primaryProfession, ignore.case = TRUE), is.na(deathYear)) |>
  separate_longer_delim(primaryProfession, delim = ",") |>
  filter(primaryProfession == "director") |>
  mutate(age = 2024 - birthYear) |>
  select(-`deathYear`)


# Joining with TITLE_PRINCIPALS
DIRECTORS_TITLES <- DIRECTORS |>
  inner_join(TITLE_PRINCIPALS, by = c("nconst" = "nconst")) |>
  left_join(TITLE_BASICS, by = c("tconst" = "tconst")) |>
  inner_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
  select(
    `nconst`,
    `primaryName`,
    `age`,
    `tconst`,
    `primaryTitle`,
    releaseYear = `startYear`,
    `decade`,
    `castCount`,
    `runtimeMinutes`,
    genres = `genres.x`,
    mainGenre = `genres.y`,
    `metric`,
    `avgRating`
  ) |>
  distinct()

# Let's get anyone who's been in a film since at least 2015
LAST_TEN_YEARS_ACTORS <- ACTORS_TITLES |>
  filter(releaseYear >= 2015)

LAST_TEN_YEARS_DIRECTORS <- DIRECTORS_TITLES |>
  filter(releaseYear >= 2015)

# since Action and Drama are the best genres, let's subset to those

BEST_GENRES_ACTORS <- LAST_TEN_YEARS_ACTORS |>
  filter(mainGenre %in% c("Action", "Drama"))

BEST_GENRES_DIRECTORS <- LAST_TEN_YEARS_DIRECTORS |>
  filter(mainGenre %in% c("Action", "Drama"))

# For actors, let's get people who can work with big supporting casts. 
# We don't know if that's what we want, but it'll be nice to have

cast_threshold <- BEST_GENRES_ACTORS |>
  summarize(threshold = quantile(castCount, 0.95, na.rm = TRUE)) |>
  pull(threshold)

  
ACTORS_CREWS <- BEST_GENRES_ACTORS |>
  group_by(nconst) |>
  summarize(median_castCount = median(castCount, na.rm = TRUE)) |>
  filter(median_castCount >= cast_threshold) |> # Filter for median castCount in the top 5 percentilE
  ungroup()

ACTORS_CAST <- BEST_GENRES_ACTORS |>
  inner_join(ACTORS_CREWS, by = c("nconst" = "nconst")) |>
  select(
    -`median_castCount`
  ) |>
  distinct()

# Let's pull their best works only
  
BEST_FILMS_ACTORS <- ACTORS_CAST |>
  group_by(nconst) |>
  filter(avgRating == max(avgRating, na.rm = TRUE)) |>
  ungroup()

# For actors, let's narrow it down to actors whos best film is from this decade

CURRENT_BEST_ACTORS <- BEST_FILMS_ACTORS |>
  filter(decade==2020)

# We'll now whittle down directors. Who can get a runtime in our sweet-spot of 70 to 150 minutes?

filtered_median_runtime <- BEST_GENRES_DIRECTORS |>
  group_by(nconst) |>
  summarize(median_runtime = median(runtimeMinutes, na.rm = TRUE)) |>  # Calculate median runtime
  filter(median_runtime >= 70 & median_runtime <= 150)  # Filter for median runtime between 70 and 150

# Doing the same but for directors

DIRECTORS_RUNTIME <- BEST_GENRES_DIRECTORS |>
  inner_join(filtered_median_runtime, by = c("nconst" = "nconst"))
# let's pull their best movies only

BEST_FILMS_DIRECTORS <- DIRECTORS_RUNTIME |>
  group_by(nconst) |>
  filter(avgRating == max(avgRating, na.rm = TRUE)) |>
  ungroup()


# Let's remove any director who is also an actor
NON_ACTOR_DIRECTORS <- BEST_FILMS_DIRECTORS |>
  anti_join(BEST_FILMS_ACTORS, by = c("nconst" = "nconst"))

# For directors, let's narrow it down to actors whos best film is from this decade
CURRENT_BEST_DIRECTORS <- NON_ACTOR_DIRECTORS |>
  filter(decade==2020)
dim(NON_ACTOR_DIRECTORS)
[1] 15530    14
dim(BEST_FILMS_ACTORS)
[1] 1371   13

Taking a quick breather! We just got out data down quite a bit. Let’s keep going.

MOST_POPULAR_ACTORS <- CURRENT_BEST_ACTORS |>
  left_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
  select(
    `nconst`,
    `primaryName`,
    `age`,
    `primaryTitle`,
    `year`,
    `mainGenre`,
    avgRating = `avgRating.x`,
    `numVotes`
  ) |>
  arrange(desc(numVotes))


# Let's also bring in how many films they've worked on
ACTOR_FILM_CT <- TITLE_PRINCIPALS |>
  filter(category == "actor") |>
  group_by(nconst) |>
  summarize(film_count = n())


MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
  left_join(ACTOR_FILM_CT, by = c("nconst" = "nconst"))

MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
  filter(!is.na(film_count))

head(MOST_POPULAR_ACTORS)
# A tibble: 6 × 9
  nconst     primaryName     age primaryTitle  year mainGenre avgRating numVotes
  <chr>      <chr>         <dbl> <chr>        <dbl> <chr>         <dbl>    <dbl>
1 nm8188622  Jacob Batalon    28 Spider-Man:…  2021 Action          8.2   910046
2 nm1069587  Darshan Kuma…    38 The Kashmir…  2022 Drama           8.6   575254
3 nm12220302 Sourav Verma     NA The Kashmir…  2022 Drama           8.6   575254
4 nm1282966  O-T Fagbenle     43 Black Widow   2021 Action          6.6   437506
5 nm0000439  Neil Patrick…    51 The Matrix …  2021 Action          5.6   286445
6 nm1186373  Adam Sztykiel    46 Black Adam    2022 Action          6.2   282231
# ℹ 1 more variable: film_count <int>
# Step 1: Join MOST_POPULAR_ACTORS with TITLE_PRINCIPALS to get tconst (movies the actor was part of)
actor_movies <- MOST_POPULAR_ACTORS |>
  inner_join(TITLE_PRINCIPALS, by = "nconst") |>
  select(nconst, tconst)

# Step 2: Join actor_movies with MOVIES to get the average rating of each movie
actor_movie_ratings <- actor_movies |>
  inner_join(MOVIES, by = "tconst") |>
  select(nconst, tconst, averageRating)

# Step 3: Calculate the average rating for each actor across their movies
actor_avg_rating <- actor_movie_ratings |>
  group_by(nconst) |>
  summarize(avg_rating = mean(averageRating, na.rm = TRUE))

# Step 4: Join the calculated average rating back to MOST_POPULAR_ACTORS
MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
  left_join(actor_avg_rating, by = "nconst")

MOST_POPULAR_ACTORS <- MOST_POPULAR_ACTORS |>
  select(
    `nconst`,
    `primaryName`,
    `age`,
    `primaryTitle`,
    `year`,
    `mainGenre`,
    avgMainMovieRating = `avgRating`,
    `numVotes`,
    filmCount = `film_count`,
    avgActorRating = `avg_rating`
  )

MOST_POPULAR_DIRECTORS <- CURRENT_BEST_DIRECTORS |>
  left_join(FILTERED_MOVIES, by = c("tconst" = "tconst")) |>
  select(
    `nconst`,
    `primaryName`,
    `age`,
    `primaryTitle`,
    `year`,
    `mainGenre`,
    avgRating = `avgRating.x`,
    `numVotes`
  ) |>
  arrange(desc(numVotes))

# Let's also bring in how many films they've worked on
DIRECTOR_FILM_CT <- TITLE_PRINCIPALS |>
  filter(category == "director") |>
  group_by(nconst) |>
  summarize(film_count = n())


MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
  left_join(DIRECTOR_FILM_CT, by = c("nconst" = "nconst"))

MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
  filter(!is.na(film_count))

# Step 1: Join MOST_POPULAR_DIRECTORS with TITLE_PRINCIPALS to get tconst (movies the director was part of)
director_movies <- MOST_POPULAR_DIRECTORS |>
  inner_join(TITLE_PRINCIPALS, by = "nconst") |>
  select(nconst, tconst)

# Step 2: Join director_movies with MOVIES to get the average rating of each movie
director_movies_ratings <- director_movies |>
  inner_join(MOVIES, by = "tconst") |>
  select(nconst, tconst, averageRating)

# Step 3: Calculate the average rating for each director across their movies
director_avg_rating <- director_movies_ratings |>
  group_by(nconst) |>
  summarize(avg_rating = mean(averageRating, na.rm = TRUE))

# Step 4: Join the calculated average rating back to MOST_POPULAR_DIRECTORS
MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
  left_join(director_avg_rating, by = "nconst")




MOST_POPULAR_DIRECTORS <- MOST_POPULAR_DIRECTORS |>
  select(
    `nconst`,
    `primaryName`,
    `age`,
    `primaryTitle`,
    `year`,
    `mainGenre`,
    avgMainMovieRating = `avgRating`,
    `numVotes`,
    filmCount = `film_count`,
    avgDirectorRating = `avg_rating`
  )
# Results!
head(MOST_POPULAR_ACTORS)
# A tibble: 6 × 10
  nconst     primaryName     age primaryTitle  year mainGenre avgMainMovieRating
  <chr>      <chr>         <dbl> <chr>        <dbl> <chr>                  <dbl>
1 nm8188622  Jacob Batalon    28 Spider-Man:…  2021 Action                   8.2
2 nm1069587  Darshan Kuma…    38 The Kashmir…  2022 Drama                    8.6
3 nm12220302 Sourav Verma     NA The Kashmir…  2022 Drama                    8.6
4 nm1282966  O-T Fagbenle     43 Black Widow   2021 Action                   6.6
5 nm0000439  Neil Patrick…    51 The Matrix …  2021 Action                   5.6
6 nm1186373  Adam Sztykiel    46 Black Adam    2022 Action                   6.2
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgActorRating <dbl>
dim(MOST_POPULAR_ACTORS)
[1] 474  10
head(MOST_POPULAR_DIRECTORS)
# A tibble: 6 × 10
  nconst    primaryName    age primaryTitle    year mainGenre avgMainMovieRating
  <chr>     <chr>        <dbl> <chr>          <dbl> <chr>                  <dbl>
1 nm1218281 Jon Watts       43 Spider-Man: N…  2021 Action                   8.2
2 nm0001282 Tony Goldwyn    64 Oppenheimer     2023 Drama                    8.3
3 nm1347153 Tyler Perry     55 Don't Look Up   2021 Drama                    7.2
4 nm0411539 Puneet Issar    65 The Kashmir F…  2022 Drama                    8.6
5 nm0426059 Rian Johnson    51 Glass Onion     2022 Drama                    7.1
6 nm0781913 Noah Segan      41 Glass Onion     2022 Drama                    7.1
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgDirectorRating <dbl>
dim(MOST_POPULAR_DIRECTORS)
[1] 3254   10
# Identify the directors with the highest values for numVotes, filmcount, and avgDirectorRating
highest_votes <- MOST_POPULAR_DIRECTORS |> filter(numVotes == max(numVotes, na.rm = TRUE))
highest_film_count <- MOST_POPULAR_DIRECTORS |> filter(filmCount == max(filmCount, na.rm = TRUE))
highest_avg_rating <- MOST_POPULAR_DIRECTORS |> filter(avgDirectorRating == max(avgDirectorRating, na.rm = TRUE))

# Combine the labels into one data frame
labels_df <- bind_rows(highest_votes, highest_film_count, highest_avg_rating)

# Create the scatter plot
ggplot(MOST_POPULAR_DIRECTORS, aes(x = filmCount, y = numVotes, color = avgDirectorRating)) +
  
  geom_point(size = 3) +  # Add points with size 3
  
  scale_color_gradient(low = "blue", high = "red") +  # Color gradient from blue (low) to red (high)
  
  scale_y_continuous(labels = scales::comma, limits = c(0, 500000)) +  # Adjust y-axis to max at 500,000
  
  labs(
    title = "Scatter Plot of Directors' Film Count vs Number of Votes",
    x = "Film Count",
    y = "Number of Votes",
    color = "Avg Director Rating"  # Label for the color legend
  ) +
  
  geom_text(data = labels_df, aes(label = paste0(" ", primaryName)), 
            vjust = -1, color = "black", size = 4, font = "bold") +  # Add labels for the highest points
  
  theme_minimal(base_size = 15) +  # Increase font size for better readability
  theme(
    plot.background = element_rect(fill = "darkgray"), 
    panel.background = element_rect(fill = "darkgray"),  
    axis.line = element_line(color = "white", size = 1.2), 
    axis.text = element_text(color = "white"),  # Set axis text color to white
    axis.title = element_text(color = "white", face = "bold"),  # Bolden axis titles
    plot.title = element_text(hjust = 0.5, color = "white", face = "bold"),  # Center the title and bold it
    axis.text.x = element_text(angle = 45, hjust = 1)  # Rotate x-axis labels if needed
  )

MOST_POPULAR_DIRECTORS
# A tibble: 3,254 × 10
   nconst    primaryName     age primaryTitle  year mainGenre avgMainMovieRating
   <chr>     <chr>         <dbl> <chr>        <dbl> <chr>                  <dbl>
 1 nm1218281 Jon Watts        43 Spider-Man:…  2021 Action                   8.2
 2 nm0001282 Tony Goldwyn     64 Oppenheimer   2023 Drama                    8.3
 3 nm1347153 Tyler Perry      55 Don't Look …  2021 Drama                    7.2
 4 nm0411539 Puneet Issar     65 The Kashmir…  2022 Drama                    8.6
 5 nm0426059 Rian Johnson     51 Glass Onion   2022 Drama                    7.1
 6 nm0781913 Noah Segan       41 Glass Onion   2022 Drama                    7.1
 7 nm3725055 Jessica Henw…    NA Glass Onion   2022 Drama                    7.1
 8 nm0072600 Ned Benson       47 Black Widow   2021 Action                   6.6
 9 nm0075244 Gabriel Beri…    75 Black Widow   2021 Action                   6.6
10 nm0795153 Cate Shortla…    56 Black Widow   2021 Action                   6.6
# ℹ 3,244 more rows
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgDirectorRating <dbl>
# Rian Johnson
MOST_POPULAR_ACTORS |>
  filter(filmCount >= 10, mainGenre == "Action") |>
  arrange(desc(avgActorRating)) |>
  print(n=50) 
# A tibble: 62 × 10
   nconst     primaryName    age primaryTitle  year mainGenre avgMainMovieRating
   <chr>      <chr>        <dbl> <chr>        <dbl> <chr>                  <dbl>
 1 nm2128363  Paul Briggs     50 Raya and th…  2021 Action                   7.3
 2 nm4514080  Bradley Cam…    NA Psycho-Pass…  2023 Action                   7  
 3 nm0239504  Bruce DuBose    64 Psycho-Pass…  2023 Action                   7  
 4 nm3139597  Shawn Gann      43 Psycho-Pass…  2023 Action                   7  
 5 nm5211370  Justin Brin…    33 Black Clove…  2023 Action                   7.4
 6 nm2995162  Artt Butler     55 Mortal Komb…  2022 Action                   6.7
 7 nm0254602  R. Bruce El…    75 Psycho-Pass…  2023 Action                   7  
 8 nm11111075 A.J. Beckles    26 Mobile Suit…  2024 Action                   6.8
 9 nm0000316  Mel Brooks      98 Paws of Fur…  2022 Action                   5.7
10 nm2616557  Ray Chase       37 Demon Slaye…  2023 Action                   7.6
11 nm0954900  Zeno Robins…    31 Mobile Suit…  2024 Action                   6.8
12 nm13506312 Alex Hom        NA New Gods: Y…  2022 Action                   6.6
13 nm0136482  Lorne Cardi…    60 Four Souls …  2023 Action                   7.2
14 nm6770060  Bill Butts      38 Pokémon the…  2020 Action                   6.4
15 nm3739133  Bayardo De …    NA Mortal Komb…  2021 Action                   6.5
16 nm0089707  Steve Blum      64 Black Clove…  2023 Action                   7.4
17 nm0564402  Simon McBur…    67 Wolfwalkers   2020 Action                   8  
18 nm6981965  Kellen Goff     NA Pokémon the…  2020 Action                   6.4
19 nm0196654  Daniel Dae …    NA Raya and th…  2021 Action                   7.3
20 nm0443632  Thierry Kaz…    NA Miraculous:…  2023 Action                   6.1
21 nm0005458  Jason Stath…    57 Fast X        2023 Action                   5.7
22 nm0010075  Jensen Ackl…    46 Batman: The…  2021 Action                   7.2
23 nm0010075  Jensen Ackl…    46 Batman: The…  2021 Action                   7.2
24 nm8188622  Jacob Batal…    28 Spider-Man:…  2021 Action                   8.2
25 nm0224007  John DiMagg…    56 Batman: The…  2021 Action                   7.2
26 nm1716941  Martial Le …    NA Miraculous:…  2023 Action                   6.1
27 nm0541902  Aasif Mandvi    58 Paws of Fur…  2022 Action                   5.7
28 nm1977856  Chin Han        55 Mortal Komb…  2021 Action                   6  
29 nm3112453  Mohammed Am…    43 Black Adam    2022 Action                   6.2
30 nm1684869  Troy Baker      48 Batman: The…  2021 Action                   7.2
31 nm1684869  Troy Baker      48 Batman: The…  2021 Action                   7.2
32 nm0071680  Clé Bennett     NA Four Souls …  2023 Action                   7.2
33 nm2642131  Zach Callis…    27 Batman: The…  2021 Action                   7.2
34 nm0235960  Robin Atkin…    48 Mortal Komb…  2021 Action                   6.5
35 nm2022559  Dave B. Mit…    55 Mortal Komb…  2021 Action                   6.5
36 nm0862916  Tommy Tiern…    55 Wolfwalkers   2020 Action                   8  
37 nm0000408  Jonathan Fr…    72 Catwoman: H…  2022 Action                   5.6
38 nm4464492  Dakota Daul…    NA The Last Vi…  2021 Action                   5  
39 nm2941559  Ike Amadi       NA Mortal Komb…  2021 Action                   6.5
40 nm0000439  Neil Patric…    51 The Matrix …  2021 Action                   5.6
41 nm0146097  James Carte…    70 Pokémon the…  2020 Action                   6.4
42 nm6299248  Bowen Yang      34 The Tiger's…  2024 Action                   5.7
43 nm0570364  Joel McHale     53 Mortal Komb…  2021 Action                   6.5
44 nm0022771  Steve Alter…    61 Trollhunter…  2021 Action                   6.6
45 nm0273159  Alex Ferns      56 Godzilla x …  2024 Action                   6.1
46 nm1189034  Bharat Dabh…    55 Ek Villain …  2022 Action                   4.4
47 nm1167985  Lewis Tan       NA Mortal Komb…  2021 Action                   6  
48 nm1282966  O-T Fagbenle    43 Black Widow   2021 Action                   6.6
49 nm0152059  Vic Chao        NA Trollhunter…  2021 Action                   6.6
50 nm0389491  Bosco Hogan     75 The Legion    2020 Action                   3.2
# ℹ 12 more rows
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgActorRating <dbl>
# Jacob Batalon and John DiMaggio
MOST_POPULAR_ACTORS |>
  filter(primaryName %in% c("Jacob Batalon","John DiMaggio"))
# A tibble: 2 × 10
  nconst    primaryName     age primaryTitle   year mainGenre avgMainMovieRating
  <chr>     <chr>         <dbl> <chr>         <dbl> <chr>                  <dbl>
1 nm8188622 Jacob Batalon    28 Spider-Man: …  2021 Action                   8.2
2 nm0224007 John DiMaggio    56 Batman: The …  2021 Action                   7.2
# ℹ 3 more variables: numVotes <dbl>, filmCount <int>, avgActorRating <dbl>

Task 6: Remaking a classic

FILTERED_MOVIES |>
  filter(year <= 1998, grepl("action",genres,ignore.case=TRUE)) |>
  arrange(desc(numVotes)) |>
  unique() |>
  print(n=30)
# A tibble: 2,120 × 9
# Groups:   tconst [2,120]
   tconst    title        year metric avgRating numVotes decade castCount genres
   <chr>     <chr>       <dbl>  <dbl>     <dbl>    <dbl>  <dbl>     <int> <chr> 
 1 tt0076759 Star Wars:…  1977    0.8       8.6  1474565   1970        24 Action
 2 tt0080684 Star Wars:…  1980    0.8       8.7  1405682   1980        24 Action
 3 tt0110413 Léon: The …  1994    1         8.5  1269895   1990        17 Actio…
 4 tt0103064 Terminator…  1991    0.8       8.6  1198273   1990        22 Action
 5 tt0086190 Star Wars:…  1983    0.8       8.3  1140230   1980        23 Action
 6 tt0107290 Jurassic P…  1993    0.8       8.2  1089588   1990        21 Action
 7 tt0082971 Raiders of…  1981    0.8       8.4  1051637   1980        23 Action
 8 tt0095016 Die Hard     1988    0.8       8.2   960964   1980        22 Actio…
 9 tt0088247 The Termin…  1984    0.8       8.1   942692   1980        19 Action
10 tt0083658 Blade Runn…  1982    1         8.1   836437   1980        22 Actio…
11 tt0097576 Indiana Jo…  1989    0.8       8.2   822476   1980        24 Action
12 tt0090605 Aliens       1986    0.8       8.4   787505   1980        20 Action
13 tt0113277 Heat         1995    0.6       8.3   733993   1990        22 Actio…
14 tt0116629 Independen…  1996    0.8       7     616858   1990        20 Action
15 tt0087469 Indiana Jo…  1984    0.8       7.5   542540   1980        23 Action
16 tt0119116 The Fifth …  1997    0.8       7.6   515969   1990        19 Action
17 tt0092099 Top Gun      1986    1         6.9   513973   1980        22 Actio…
18 tt0117060 Mission: I…  1996    0.8       7.2   478193   1990        22 Actio…
19 tt0119567 The Lost W…  1997    0.8       6.6   450702   1990        21 Action
20 tt0119698 Princess M…  1997    0.8       8.3   444550   1990        29 Action
21 tt0112864 Die Hard w…  1995    0.8       7.6   413230   1990        20 Actio…
22 tt0096895 Batman       1989    0.8       7.5   411928   1980        23 Action
23 tt0119094 Face/Off     1997    0.8       7.3   407477   1990        23 Action
24 tt0111257 Speed        1994    0.8       7.3   399472   1990        19 Actio…
25 tt0099423 Die Hard 2   1990    0.8       7.1   387990   1990        24 Actio…
26 tt0117500 The Rock     1996    0.8       7.4   363620   1990        23 Actio…
27 tt0100802 Total Reca…  1990    0.8       7.5   360446   1990        25 Action
28 tt0103776 Batman Ret…  1992    0.8       7.1   333509   1990        24 Action
29 tt0120201 Starship T…  1997    0.8       7.3   328003   1990        22 Action
30 tt0106977 The Fugiti…  1993    1         7.8   325363   1990        24 Actio…
# ℹ 2,090 more rows
# Independence Day seems like a good choice, let's make sure there's only one of it though

FILTERED_MOVIES |>
  filter(title == "Independence Day")
# A tibble: 2 × 9
# Groups:   tconst [2]
  tconst    title         year metric avgRating numVotes decade castCount genres
  <chr>     <chr>        <dbl>  <dbl>     <dbl>    <dbl>  <dbl>     <int> <chr> 
1 tt0085724 Independenc…  1983    1         6.5      585   1980        21 Drama 
2 tt0116629 Independenc…  1996    0.8       7     616858   1990        20 Action

Looks like we’re good to go with this remake using this cast and crew!